Data Inspection¶

In [1]:
import plotly
plotly.offline.init_notebook_mode()
In [2]:
import pandas as pd
from tqdm import tqdm

df_list = list()
chunk_iter = pd.read_csv(
    "../data/Total_Data_10Y_Top24.csv", 
    chunksize=100000, 
    dtype = {
        "CANCELLATION_CODE": str,
    }
)
for chunk in tqdm(chunk_iter):
    df_list.append(chunk)
df = pd.concat(df_list)
df.head()
130it [00:28,  4.57it/s]
Out[2]:
FL_DATE OP_UNIQUE_CARRIER TAIL_NUM ORIGIN_AIRPORT_SEQ_ID ORIGIN_CITY_MARKET_ID ORIGIN ORIGIN_CITY_NAME DEST_AIRPORT_SEQ_ID DEST_CITY_MARKET_ID DEST ... TAXI_IN ARR_TIME ARR_DELAY CANCELLED CANCELLATION_CODE CARRIER_DELAY WEATHER_DELAY NAS_DELAY SECURITY_DELAY LATE_AIRCRAFT_DELAY
0 2014-07-01 AA N002AA 1105703 31057 CLT Charlotte, NC 1129803 30194 DFW ... 28.0 1214.0 9.0 0.0 NaN NaN NaN NaN NaN NaN
1 2014-07-01 AA N002AA 1129803 30194 DFW Dallas/Fort Worth, TX 1105703 31057 CLT ... 13.0 945.0 0.0 0.0 NaN NaN NaN NaN NaN NaN
2 2014-07-01 AA N004AA 1039705 30397 ATL Atlanta, GA 1129803 30194 DFW ... 6.0 1341.0 -9.0 0.0 NaN NaN NaN NaN NaN NaN
3 2014-07-01 AA N004AA 1129803 30194 DFW Dallas/Fort Worth, TX 1039705 30397 ATL ... 7.0 1159.0 4.0 0.0 NaN NaN NaN NaN NaN NaN
4 2014-07-01 AA N004AA 1129803 30194 DFW Dallas/Fort Worth, TX 1039705 30397 ATL ... 6.0 2317.0 2.0 0.0 NaN NaN NaN NaN NaN NaN

5 rows × 24 columns

In [3]:
df.describe()
Out[3]:
ORIGIN_AIRPORT_SEQ_ID ORIGIN_CITY_MARKET_ID DEST_AIRPORT_SEQ_ID DEST_CITY_MARKET_ID DEP_TIME DEP_DELAY TAXI_OUT TAXI_IN ARR_TIME ARR_DELAY CANCELLED CARRIER_DELAY WEATHER_DELAY NAS_DELAY SECURITY_DELAY LATE_AIRCRAFT_DELAY
count 1.292656e+07 1.292656e+07 1.292656e+07 1.292656e+07 1.271494e+07 1.271486e+07 1.271048e+07 1.270744e+07 1.270744e+07 1.268139e+07 1.292656e+07 2.559603e+06 2.559603e+06 2.559603e+06 2.559603e+06 2.559603e+06
mean 1.298824e+06 3.161338e+04 1.298812e+06 3.161370e+04 1.332409e+03 1.127152e+01 1.722694e+01 8.678440e+00 1.471801e+03 5.409690e+00 1.675551e-02 2.041558e+01 2.650336e+00 1.520350e+01 1.268451e-01 2.413592e+01
std 1.453419e+05 1.168922e+03 1.453231e+05 1.168763e+03 5.192380e+02 4.514114e+01 9.141093e+00 6.704624e+00 5.554134e+02 4.738376e+01 1.283540e-01 5.935956e+01 2.208897e+01 3.191662e+01 3.210375e+00 5.163883e+01
min 1.039705e+06 3.019400e+04 1.039705e+06 3.019400e+04 1.000000e+00 -2.340000e+02 0.000000e+00 1.000000e+00 1.000000e+00 -2.380000e+02 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
25% 1.129806e+06 3.046600e+04 1.129806e+06 3.046600e+04 9.060000e+02 -5.000000e+00 1.200000e+01 5.000000e+00 1.052000e+03 -1.400000e+01 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
50% 1.289208e+06 3.145400e+04 1.289208e+06 3.145400e+04 1.324000e+03 -1.000000e+00 1.500000e+01 7.000000e+00 1.515000e+03 -5.000000e+00 0.000000e+00 3.000000e+00 0.000000e+00 2.000000e+00 0.000000e+00 0.000000e+00
75% 1.410702e+06 3.245700e+04 1.410702e+06 3.245700e+04 1.751000e+03 9.000000e+00 2.000000e+01 1.000000e+01 1.930000e+03 9.000000e+00 0.000000e+00 2.000000e+01 0.000000e+00 1.900000e+01 0.000000e+00 2.800000e+01
max 1.501606e+06 3.481900e+04 1.501606e+06 3.481900e+04 2.400000e+03 3.695000e+03 2.270000e+02 1.419000e+03 2.400000e+03 3.680000e+03 1.000000e+00 3.359000e+03 2.692000e+03 1.511000e+03 9.870000e+02 3.581000e+03
In [4]:
df.FL_DATE.max()
Out[4]:
'2024-06-30'
In [5]:
df.nunique()
Out[5]:
FL_DATE                  3653
OP_UNIQUE_CARRIER          20
TAIL_NUM                 9140
ORIGIN_AIRPORT_SEQ_ID      58
ORIGIN_CITY_MARKET_ID      20
ORIGIN                     24
ORIGIN_CITY_NAME           24
DEST_AIRPORT_SEQ_ID        58
DEST_CITY_MARKET_ID        20
DEST                       24
DEST_CITY_NAME             24
DEP_TIME                 1440
DEP_DELAY                1783
TAXI_OUT                  195
TAXI_IN                   247
ARR_TIME                 1440
ARR_DELAY                1806
CANCELLED                   2
CANCELLATION_CODE           4
CARRIER_DELAY            1558
WEATHER_DELAY             971
NAS_DELAY                 855
SECURITY_DELAY            268
LATE_AIRCRAFT_DELAY      1228
dtype: int64
In [6]:
df.dtypes
Out[6]:
FL_DATE                   object
OP_UNIQUE_CARRIER         object
TAIL_NUM                  object
ORIGIN_AIRPORT_SEQ_ID      int64
ORIGIN_CITY_MARKET_ID      int64
ORIGIN                    object
ORIGIN_CITY_NAME          object
DEST_AIRPORT_SEQ_ID        int64
DEST_CITY_MARKET_ID        int64
DEST                      object
DEST_CITY_NAME            object
DEP_TIME                 float64
DEP_DELAY                float64
TAXI_OUT                 float64
TAXI_IN                  float64
ARR_TIME                 float64
ARR_DELAY                float64
CANCELLED                float64
CANCELLATION_CODE         object
CARRIER_DELAY            float64
WEATHER_DELAY            float64
NAS_DELAY                float64
SECURITY_DELAY           float64
LATE_AIRCRAFT_DELAY      float64
dtype: object

Data Cleaning¶

In [7]:
MEDIUM_AIRPORT_CODE = [
    "DAL",
    "PDX",
    "STL",
    "RDU",
    "HOU",
    "SMF",
    "MSY",
    "SJC",
    "SJU",
    "SNA"
]
In [8]:
import matplotlib.pyplot as plt
from datetime import time

target_df = df.drop(columns = [
    "OP_UNIQUE_CARRIER",
    "TAIL_NUM",
    "ORIGIN_AIRPORT_SEQ_ID",
    "DEST_AIRPORT_SEQ_ID",
    "TAXI_OUT",
    "TAXI_IN",
])
del df
In [9]:
import numpy as np

target_df['FL_DATE'] = pd.to_datetime(target_df['FL_DATE'])
target_df['ARR_TIME'] = target_df['ARR_TIME'].apply(
    lambda x: x if np.isnan(x) else time(hour=int(x)//100%24, minute=int(x)%100)
)
target_df['DEP_TIME'] = target_df['DEP_TIME'].apply(
    lambda x: x if np.isnan(x) else time(hour=int(x)//100%24, minute=int(x)%100)
)
In [10]:
airport_set_df = target_df.ORIGIN.drop_duplicates()
airport_review = pd.read_csv("../supplementary/Airport-Reviews-Table.csv")
airport_review = airport_review[airport_review.AIRPORT_CODE.isin(airport_set_df)]

airport_default_score = airport_review.groupby("AIRPORT_CODE")[[
    'ratingValue',
    'queueTime', 
    'terminalCleanliness', 
    'terminalSeating', 
    'terminalSign', 
    'foodBeverage', 
    'airportShopping', 
    'wifiConnectivity', 
    'airportStaff', 
    'sentiment'
]].mean().reset_index()

airport_review_count = airport_review.groupby("AIRPORT_CODE").size().reset_index().rename(columns = {0:"reviewCount"})
airport_review_merged = pd.merge(airport_review_count, airport_default_score, how = "inner", on = "AIRPORT_CODE")
airport_review_merged['Large'] = ~airport_review_merged.AIRPORT_CODE.isin(MEDIUM_AIRPORT_CODE)
airport_review_merged.head()
Out[10]:
AIRPORT_CODE reviewCount ratingValue queueTime terminalCleanliness terminalSeating terminalSign foodBeverage airportShopping wifiConnectivity airportStaff sentiment Large
0 ATL 333 0.256456 1.939940 2.750751 2.390511 2.623853 1.784195 2.450777 2.613333 1.754902 -0.741742 True
1 CLT 169 0.242012 1.958580 2.349112 2.019481 2.625767 1.951220 2.468468 2.100917 1.947020 -0.715976 True
2 DAL 26 0.450000 3.173913 3.695652 3.150000 3.478261 3.304348 3.090909 3.714286 2.875000 -0.153846 False
3 DEN 151 0.260265 1.847682 2.536424 2.253731 2.213333 2.006757 2.308511 2.557895 1.934307 -0.761589 True
4 DFW 175 0.321143 2.298851 2.908046 2.771429 2.812865 2.179191 2.793478 2.647619 2.102564 -0.531429 True
In [11]:
target_airport_df = pd.merge(
    target_df, 
    airport_review_merged,
    how = "inner", 
    left_on = "ORIGIN", 
    right_on = "AIRPORT_CODE",
    suffixes = ["", "_origin"]
)
target_airport_df = pd.merge(
    target_airport_df, 
    airport_review_merged,
    how = "inner", 
    left_on = "DEST", 
    right_on = "AIRPORT_CODE",
    suffixes = ["", "_dest"]
)
target_airport_df.rename(columns = {
    'reviewCount':'Origin Review Count', 
    'ratingValue':'Origin Rating Value', 
    'queueTime':'Origin Queue Time',
    'terminalCleanliness':'Origin Terminal Cleanliness', 
    'terminalSeating':'Origin Terminal Seating', 
    'terminalSign':'Origin Terminal Sign',
    'foodBeverage':'Origin Food Beverage', 
    'airportShopping':'Origin Airport Shopping', 
    'wifiConnectivity':'Origin Wifi Connectivity', 
    'airportStaff':'Origin Airport Staff',
    'sentiment':'Origin Sentiment', 
    'Large':'Origin Busy',
    'reviewCount_dest':'Destination Review Count', 
    'ratingValue_dest':'Destination Rating Value', 
    'queueTime_dest':'Destination Queue Time',
    'terminalCleanliness_dest':'Destination Terminal Cleanliness', 
    'terminalSeating_dest':'Destination Terminal Seating', 
    'terminalSign_dest':'Destination Terminal Sign',
    'foodBeverage_dest':'Destination Food Beverage', 
    'airportShopping_dest':'Destination Airport Shopping', 
    'wifiConnectivity_dest':'Destination Wifi Connectivity', 
    'airportStaff_dest':'Destination Airport Staff',
    'sentiment_dest':'Destination Sentiment', 
    'Large_dest':'Destination Busy'
}, inplace = True)
target_airport_df.drop(columns = ['AIRPORT_CODE', 'AIRPORT_CODE_dest'], inplace = True)
target_airport_df['CANCELLATION_CODE'] = target_airport_df['CANCELLATION_CODE'].fillna("Not")
target_airport_df.head()
Out[11]:
FL_DATE ORIGIN_CITY_MARKET_ID ORIGIN ORIGIN_CITY_NAME DEST_CITY_MARKET_ID DEST DEST_CITY_NAME DEP_TIME DEP_DELAY ARR_TIME ... Destination Queue Time Destination Terminal Cleanliness Destination Terminal Seating Destination Terminal Sign Destination Food Beverage Destination Airport Shopping Destination Wifi Connectivity Destination Airport Staff Destination Sentiment Destination Busy
0 2014-07-01 31057 CLT Charlotte, NC 30194 DFW Dallas/Fort Worth, TX 10:25:00 -5.0 12:14:00 ... 2.298851 2.908046 2.771429 2.812865 2.179191 2.793478 2.647619 2.102564 -0.531429 True
1 2014-07-01 30194 DFW Dallas/Fort Worth, TX 31057 CLT Charlotte, NC 06:21:00 -4.0 09:45:00 ... 1.958580 2.349112 2.019481 2.625767 1.951220 2.468468 2.100917 1.947020 -0.715976 True
2 2014-07-01 30397 ATL Atlanta, GA 30194 DFW Dallas/Fort Worth, TX 12:38:00 -2.0 13:41:00 ... 2.298851 2.908046 2.771429 2.812865 2.179191 2.793478 2.647619 2.102564 -0.531429 True
3 2014-07-01 30194 DFW Dallas/Fort Worth, TX 30397 ATL Atlanta, GA 09:04:00 14.0 11:59:00 ... 1.939940 2.750751 2.390511 2.623853 1.784195 2.450777 2.613333 1.754902 -0.741742 True
4 2014-07-01 30194 DFW Dallas/Fort Worth, TX 30397 ATL Atlanta, GA 20:16:00 1.0 23:17:00 ... 1.939940 2.750751 2.390511 2.623853 1.784195 2.450777 2.613333 1.754902 -0.741742 True

5 rows × 42 columns

In [12]:
target_airport_df
Out[12]:
FL_DATE ORIGIN_CITY_MARKET_ID ORIGIN ORIGIN_CITY_NAME DEST_CITY_MARKET_ID DEST DEST_CITY_NAME DEP_TIME DEP_DELAY ARR_TIME ... Destination Queue Time Destination Terminal Cleanliness Destination Terminal Seating Destination Terminal Sign Destination Food Beverage Destination Airport Shopping Destination Wifi Connectivity Destination Airport Staff Destination Sentiment Destination Busy
0 2014-07-01 31057 CLT Charlotte, NC 30194 DFW Dallas/Fort Worth, TX 10:25:00 -5.0 12:14:00 ... 2.298851 2.908046 2.771429 2.812865 2.179191 2.793478 2.647619 2.102564 -0.531429 True
1 2014-07-01 30194 DFW Dallas/Fort Worth, TX 31057 CLT Charlotte, NC 06:21:00 -4.0 09:45:00 ... 1.958580 2.349112 2.019481 2.625767 1.951220 2.468468 2.100917 1.947020 -0.715976 True
2 2014-07-01 30397 ATL Atlanta, GA 30194 DFW Dallas/Fort Worth, TX 12:38:00 -2.0 13:41:00 ... 2.298851 2.908046 2.771429 2.812865 2.179191 2.793478 2.647619 2.102564 -0.531429 True
3 2014-07-01 30194 DFW Dallas/Fort Worth, TX 30397 ATL Atlanta, GA 09:04:00 14.0 11:59:00 ... 1.939940 2.750751 2.390511 2.623853 1.784195 2.450777 2.613333 1.754902 -0.741742 True
4 2014-07-01 30194 DFW Dallas/Fort Worth, TX 30397 ATL Atlanta, GA 20:16:00 1.0 23:17:00 ... 1.939940 2.750751 2.390511 2.623853 1.784195 2.450777 2.613333 1.754902 -0.741742 True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
12926551 2018-12-31 31057 CLT Charlotte, NC 31703 EWR Newark, NJ 10:06:00 -9.0 11:49:00 ... 1.778146 2.258278 1.988806 2.344595 1.568562 1.817352 1.980000 1.562724 -0.801325 True
12926552 2018-12-31 31703 EWR Newark, NJ 31057 CLT Charlotte, NC 14:48:00 -7.0 16:59:00 ... 1.958580 2.349112 2.019481 2.625767 1.951220 2.468468 2.100917 1.947020 -0.715976 True
12926553 2018-12-31 31703 JFK New York, NY 30194 DFW Dallas/Fort Worth, TX 18:21:00 1.0 21:55:00 ... 2.298851 2.908046 2.771429 2.812865 2.179191 2.793478 2.647619 2.102564 -0.531429 True
12926554 2018-12-31 31703 JFK New York, NY 30977 ORD Chicago, IL 07:54:00 -5.0 10:01:00 ... 2.065789 2.377193 2.107692 2.423423 1.879464 2.303448 2.165605 1.880383 -0.666667 True
12926555 2018-12-31 30977 ORD Chicago, IL 31703 JFK New York, NY 10:37:00 -3.0 13:52:00 ... 1.784689 2.327751 2.082111 2.387019 1.429952 2.061594 2.223776 1.635204 -0.742243 True

12926556 rows × 42 columns

Visualization¶

Delay Trend¶

In [12]:
delay_by_year
Out[12]:
Year variable Delay (m)
0 2014 00 DEP_DELAY 10.378665
1 2015 00 DEP_DELAY 10.414516
2 2016 00 DEP_DELAY 10.474933
3 2017 00 DEP_DELAY 10.878647
4 2018 00 DEP_DELAY 10.739813
... ... ... ...
61 2020 00 LATE_AIRCRAFT_DELAY 16.810856
62 2021 00 LATE_AIRCRAFT_DELAY 22.448871
63 2022 00 LATE_AIRCRAFT_DELAY 24.445999
64 2023 00 LATE_AIRCRAFT_DELAY 27.877123
65 2024 00 LATE_AIRCRAFT_DELAY 29.435622

66 rows × 3 columns

In [13]:
import plotly.express as px

delay_by_year = target_airport_df[[
    'FL_DATE',
    "DEP_DELAY",
    'CARRIER_DELAY',
    'WEATHER_DELAY', 
    'NAS_DELAY', 
    'SECURITY_DELAY',
    'LATE_AIRCRAFT_DELAY',
]]
delay_by_year['Year'] = delay_by_year['FL_DATE'].dt.strftime("%Y %m")
delay_by_year = delay_by_year.groupby("Year")[[
    "DEP_DELAY",
    'CARRIER_DELAY',
    'WEATHER_DELAY', 
    'NAS_DELAY', 
    'SECURITY_DELAY',
    'LATE_AIRCRAFT_DELAY',
]].mean().reset_index()
delay_by_year = delay_by_year.melt(id_vars = "Year", value_name = "Delay (m)")

fig = px.area(
    delay_by_year, 
    x="Year", 
    y="Delay (m)",
    color ='variable',
    title='Airport Delay Trend in the Past 10Y',
)
fig.update_layout(
    height=600,
)
fig.show()
C:\Users\wongh\AppData\Local\Temp\ipykernel_22092\870454222.py:12: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Airport Review¶

First of all, let's have a look at the Review Data we collected

In [13]:
airport_review_merged.head()
Out[13]:
AIRPORT_CODE reviewCount ratingValue queueTime terminalCleanliness terminalSeating terminalSign foodBeverage airportShopping wifiConnectivity airportStaff sentiment Large
0 ATL 333 0.256456 1.939940 2.750751 2.390511 2.623853 1.784195 2.450777 2.613333 1.754902 -0.741742 True
1 CLT 169 0.242012 1.958580 2.349112 2.019481 2.625767 1.951220 2.468468 2.100917 1.947020 -0.715976 True
2 DAL 26 0.450000 3.173913 3.695652 3.150000 3.478261 3.304348 3.090909 3.714286 2.875000 -0.153846 False
3 DEN 151 0.260265 1.847682 2.536424 2.253731 2.213333 2.006757 2.308511 2.557895 1.934307 -0.761589 True
4 DFW 175 0.321143 2.298851 2.908046 2.771429 2.812865 2.179191 2.793478 2.647619 2.102564 -0.531429 True
In [14]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

corr_matrix = airport_review_merged.loc[:, 'reviewCount':].corr()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', mask = mask)
plt.title('Airport Review Score Correlation')
plt.show()
No description has been provided for this image

Summary

The majority of the score given by users are very correlated,
with the exception of reviewCount of an airport, which inversely correlate with all other attributes.

Airport Ranking¶

In [16]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots 
In [61]:
fig = go.Figure()

fig = px.scatter(
    data_frame = airport_review_merged,
    x='sentiment', 
    y='ratingValue',
    color='ratingValue',
    size = 'reviewCount',
    color_continuous_scale = 'viridis',
    facet_col = "Large",
    text='AIRPORT_CODE',
    log_y = True
)
fig.update_traces(textposition='top center')
fig.for_each_annotation(
    lambda a: a.update(text= "Medium Airport" if a.text == "Large=False" else "Large Airport")
)
fig.update_layout(
    height=600, 
    width=1000, 
    title_text="Airport Sentiment Visualization", 
    showlegend=False
)
fig.show()

Review Performance per Airport¶

In [18]:
df_columns = [
    'queueTime',
    'terminalCleanliness',
    'terminalSeating',
    'terminalSign',
    'foodBeverage',
    'airportShopping',
    'wifiConnectivity',
    'airportStaff',
]
df_better_names = [
    'Queue Time',
    'Terminal Cleanliness',
    'Terminal Seating',
    'Terminal Sign',
    'Food Beverage',
    'Airport Shopping',
    'Wifi Connectivity',
    'Airport Staff',
]
fig = make_subplots(rows=4, cols=2, subplot_titles=df_better_names)
col = [1, 2]*4
row = [1, 1, 2, 2, 3, 3, 4, 4]
for c, r, column, better_column in zip(col, row, df_columns, df_better_names):
    airport_review_merged.sort_values(column, ascending = False, inplace = True)
    trace = go.Bar(
        x=airport_review_merged['AIRPORT_CODE'], 
        y=airport_review_merged[column],
        marker=dict(
            color = airport_review_merged[column],
            colorscale='viridis'
        )
    )
    fig.add_trace(
        trace, 
        row=r, 
        col=c
    )
fig.update_layout(
    height=1000, 
    title_text="Individual Scoring Attributes of Airports",
    showlegend=False
)
fig.show()

Delay vs Ranking¶

In [19]:
target_airport_df.columns
Out[19]:
Index(['FL_DATE', 'ORIGIN_CITY_MARKET_ID', 'ORIGIN', 'ORIGIN_CITY_NAME',
       'DEST_CITY_MARKET_ID', 'DEST', 'DEST_CITY_NAME', 'DEP_TIME',
       'DEP_DELAY', 'ARR_TIME', 'ARR_DELAY', 'CANCELLED', 'CANCELLATION_CODE',
       'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY',
       'LATE_AIRCRAFT_DELAY', 'Origin Review Count', 'Origin Rating Value',
       'Origin Queue Time', 'Origin Terminal Cleanliness',
       'Origin Terminal Seating', 'Origin Terminal Sign',
       'Origin Food Beverage', 'Origin Airport Shopping',
       'Origin Wifi Connectivity', 'Origin Airport Staff', 'Origin Sentiment',
       'Origin Busy', 'Destination Review Count', 'Destination Rating Value',
       'Destination Queue Time', 'Destination Terminal Cleanliness',
       'Destination Terminal Seating', 'Destination Terminal Sign',
       'Destination Food Beverage', 'Destination Airport Shopping',
       'Destination Wifi Connectivity', 'Destination Airport Staff',
       'Destination Sentiment', 'Destination Busy'],
      dtype='object')
In [30]:
train_columns = [
    'Origin Rating Value', 
    'Origin Queue Time',
    'Origin Terminal Cleanliness', 
    'Origin Terminal Seating', 
    'Origin Terminal Sign',
    'Origin Food Beverage', 
    'Origin Airport Shopping', 
    'Origin Wifi Connectivity', 
    'Origin Airport Staff',
    'Origin Sentiment', 
    'Origin Busy',
    'Destination Rating Value', 
    'Destination Queue Time',
    'Destination Terminal Cleanliness', 
    'Destination Terminal Seating', 
    'Destination Terminal Sign',
    'Destination Food Beverage', 
    'Destination Airport Shopping', 
    'Destination Wifi Connectivity', 
    'Destination Airport Staff',
    'Destination Sentiment', 
    'Destination Busy'
]
label_columns = [
    "DEP_DELAY",
    "ARR_DELAY",
    "CARRIER_DELAY",
    "WEATHER_DELAY",
    "NAS_DELAY",
    "SECURITY_DELAY",
    "LATE_AIRCRAFT_DELAY",
]

Delay Data Size >0

In [32]:
cal_df = target_airport_df[['ORIGIN'] + label_columns]
delay = cal_df.groupby('ORIGIN')[label_columns].apply(lambda x: (x > 0).mean()*100).reset_index().sort_values("ARR_DELAY", ascending = False)
big_delay = cal_df.groupby('ORIGIN')[label_columns].apply(lambda x: (x > 60).mean()*100).reset_index().sort_values("ARR_DELAY", ascending = False)
In [53]:
import pandas as pd
import matplotlib.pyplot as plt

df1 = delay
df2 = big_delay

fig, ax = plt.subplots(figsize=(12, 6))

bar_width = 0.35
index = range(len(df1))
bars3 = ax.bar([i for i in index], [100 for _ in range(len(df2['ARR_DELAY']))], bar_width, label='All Flight', color='royalblue')
bars1 = ax.bar(index, df1['ARR_DELAY'], bar_width, label='Arrival Delay > 0', color='pink')
bars2 = ax.bar([i for i in index], df2['ARR_DELAY'], bar_width, label='Arrival Delay > 60', color='red')


# Adding labels, title, and legend
ax.set_xlabel('Airport Code')
ax.set_ylabel('Percentage (%)')
ax.set_title('(%) of Delay Flights by Airport')
ax.set_xticks([i + bar_width / 2 for i in index])
ax.set_xticklabels(df1['ORIGIN'])
ax.legend()

plt.savefig("Delay_by_Airport")
No description has been provided for this image
In [13]:
target_airport_df.columns
Out[13]:
Index(['FL_DATE', 'ORIGIN_CITY_MARKET_ID', 'ORIGIN', 'ORIGIN_CITY_NAME',
       'DEST_CITY_MARKET_ID', 'DEST', 'DEST_CITY_NAME', 'DEP_DELAY',
       'ARR_DELAY', 'CANCELLED', 'CANCELLATION_CODE', 'CARRIER_DELAY',
       'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY',
       'Origin Review Count', 'Origin Rating Value', 'Origin Queue Time',
       'Origin Terminal Cleanliness', 'Origin Terminal Seating',
       'Origin Terminal Sign', 'Origin Food Beverage',
       'Origin Airport Shopping', 'Origin Wifi Connectivity',
       'Origin Airport Staff', 'Origin Sentiment', 'Origin Busy',
       'Destination Review Count', 'Destination Rating Value',
       'Destination Queue Time', 'Destination Terminal Cleanliness',
       'Destination Terminal Seating', 'Destination Terminal Sign',
       'Destination Food Beverage', 'Destination Airport Shopping',
       'Destination Wifi Connectivity', 'Destination Airport Staff',
       'Destination Sentiment', 'Destination Busy'],
      dtype='object')
In [51]:
target_airport_df = target_airport_df[(target_airport_df['FL_DATE'].dt.month >= 5) & (target_airport_df['FL_DATE'].dt.month <= 8)]
target_airport_df.nunique()
Out[51]:
FL_DATE                             1230
ORIGIN_CITY_MARKET_ID                 20
ORIGIN                                24
ORIGIN_CITY_NAME                      24
DEST_CITY_MARKET_ID                   20
DEST                                  24
DEST_CITY_NAME                        24
DEP_TIME                            1440
DEP_DELAY                           1513
ARR_TIME                            1440
ARR_DELAY                           1563
CANCELLED                              2
CANCELLATION_CODE                      5
CARRIER_DELAY                       1317
WEATHER_DELAY                        716
NAS_DELAY                            684
SECURITY_DELAY                       200
LATE_AIRCRAFT_DELAY                 1063
Origin Review Count                   23
Origin Rating Value                   24
Origin Queue Time                     24
Origin Terminal Cleanliness           23
Origin Terminal Seating               24
Origin Terminal Sign                  24
Origin Food Beverage                  24
Origin Airport Shopping               24
Origin Wifi Connectivity              23
Origin Airport Staff                  24
Origin Sentiment                      23
Origin Busy                            2
Destination Review Count              23
Destination Rating Value              24
Destination Queue Time                24
Destination Terminal Cleanliness      23
Destination Terminal Seating          24
Destination Terminal Sign             24
Destination Food Beverage             24
Destination Airport Shopping          24
Destination Wifi Connectivity         23
Destination Airport Staff             24
Destination Sentiment                 23
Destination Busy                       2
dtype: int64
In [17]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

corr_matrix = target_airport_df.loc[:, train_columns + label_columns].corr()

plt.figure(figsize=(10, 5))
sns.heatmap(corr_matrix.loc[train_columns, label_columns], annot=True, cmap='coolwarm')
plt.title('Airport Review Score Correlation with Delays')
plt.show()

plt.figure(figsize=(10, 5))
label_columns.remove('NAS_DELAY')
sns.heatmap(corr_matrix.loc[train_columns, label_columns], annot=True, cmap='coolwarm')
plt.title('Airport Review Score Correlation with Delays (Without NAS)')
plt.show()
No description has been provided for this image
No description has been provided for this image
In [40]:
import plotly.express as px

fig = px.box(
    target_airport_df.sample(1000000, replace = False).sort_values(["Destination Busy", "NAS_DELAY"]), 
    x="DEST", 
    y="NAS_DELAY",
    log_y = True,
    color = f"Destination Busy",
    color_discrete_map = {True: 'red', False: 'blue'}
)
fig.update_layout(
    height=400, 
    title_text=f"Destination Aiport vs NAS Delay",
    showlegend=True
)
fig.show()

Departure Delay Airport Route¶

In [56]:
target_airport_df
Out[56]:
FL_DATE ORIGIN_CITY_MARKET_ID ORIGIN ORIGIN_CITY_NAME DEST_CITY_MARKET_ID DEST DEST_CITY_NAME DEP_TIME DEP_DELAY ARR_TIME ... Destination Terminal Cleanliness Destination Terminal Seating Destination Terminal Sign Destination Food Beverage Destination Airport Shopping Destination Wifi Connectivity Destination Airport Staff Destination Sentiment Destination Busy Route
0 2014-07-01 31057 CLT Charlotte, NC 30194 DFW Dallas/Fort Worth, TX 1025.0 -5.0 1214.0 ... 2.908046 2.771429 2.812865 2.179191 2.793478 2.647619 2.102564 -0.531429 True Charlotte, NC TO Dallas/Fort Worth, TX
1 2014-07-01 30194 DFW Dallas/Fort Worth, TX 31057 CLT Charlotte, NC 621.0 -4.0 945.0 ... 2.349112 2.019481 2.625767 1.951220 2.468468 2.100917 1.947020 -0.715976 True Dallas/Fort Worth, TX TO Charlotte, NC
2 2014-07-01 30397 ATL Atlanta, GA 30194 DFW Dallas/Fort Worth, TX 1238.0 -2.0 1341.0 ... 2.908046 2.771429 2.812865 2.179191 2.793478 2.647619 2.102564 -0.531429 True Atlanta, GA TO Dallas/Fort Worth, TX
3 2014-07-01 30194 DFW Dallas/Fort Worth, TX 30397 ATL Atlanta, GA 904.0 14.0 1159.0 ... 2.750751 2.390511 2.623853 1.784195 2.450777 2.613333 1.754902 -0.741742 True Dallas/Fort Worth, TX TO Atlanta, GA
4 2014-07-01 30194 DFW Dallas/Fort Worth, TX 30397 ATL Atlanta, GA 2016.0 1.0 2317.0 ... 2.750751 2.390511 2.623853 1.784195 2.450777 2.613333 1.754902 -0.741742 True Dallas/Fort Worth, TX TO Atlanta, GA
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
12130797 2019-07-31 31057 CLT Charlotte, NC 31703 EWR Newark, NJ 2035.0 245.0 2218.0 ... 2.258278 1.988806 2.344595 1.568562 1.817352 1.980000 1.562724 -0.801325 True Charlotte, NC TO Newark, NJ
12130798 2019-07-31 30325 DEN Denver, CO 30397 ATL Atlanta, GA 1332.0 -3.0 1834.0 ... 2.750751 2.390511 2.623853 1.784195 2.450777 2.613333 1.754902 -0.741742 True Denver, CO TO Atlanta, GA
12130799 2019-07-31 30397 ATL Atlanta, GA 30325 DEN Denver, CO 1342.0 -3.0 1454.0 ... 2.536424 2.253731 2.213333 2.006757 2.308511 2.557895 1.934307 -0.761589 True Atlanta, GA TO Denver, CO
12130800 2019-07-31 30325 DEN Denver, CO 30397 ATL Atlanta, GA 747.0 -3.0 1238.0 ... 2.750751 2.390511 2.623853 1.784195 2.450777 2.613333 1.754902 -0.741742 True Denver, CO TO Atlanta, GA
12130801 2019-07-31 30977 ORD Chicago, IL 30397 ATL Atlanta, GA 2103.0 -7.0 2359.0 ... 2.750751 2.390511 2.623853 1.784195 2.450777 2.613333 1.754902 -0.741742 True Chicago, IL TO Atlanta, GA

4434224 rows × 43 columns

In [16]:
target_airport_df['Route'] = target_airport_df.apply(lambda row: row['ORIGIN_CITY_NAME'] + " -> " + row['DEST_CITY_NAME'], axis = 1)
target_airport_df.head()
Out[16]:
FL_DATE ORIGIN_CITY_MARKET_ID ORIGIN ORIGIN_CITY_NAME DEST_CITY_MARKET_ID DEST DEST_CITY_NAME DEP_TIME DEP_DELAY ARR_TIME ... Destination Terminal Cleanliness Destination Terminal Seating Destination Terminal Sign Destination Food Beverage Destination Airport Shopping Destination Wifi Connectivity Destination Airport Staff Destination Sentiment Destination Busy Route
0 2014-07-01 31057 CLT Charlotte, NC 30194 DFW Dallas/Fort Worth, TX 10:25:00 -5.0 12:14:00 ... 2.908046 2.771429 2.812865 2.179191 2.793478 2.647619 2.102564 -0.531429 True Charlotte, NC -> Dallas/Fort Worth, TX
1 2014-07-01 30194 DFW Dallas/Fort Worth, TX 31057 CLT Charlotte, NC 06:21:00 -4.0 09:45:00 ... 2.349112 2.019481 2.625767 1.951220 2.468468 2.100917 1.947020 -0.715976 True Dallas/Fort Worth, TX -> Charlotte, NC
2 2014-07-01 30397 ATL Atlanta, GA 30194 DFW Dallas/Fort Worth, TX 12:38:00 -2.0 13:41:00 ... 2.908046 2.771429 2.812865 2.179191 2.793478 2.647619 2.102564 -0.531429 True Atlanta, GA -> Dallas/Fort Worth, TX
3 2014-07-01 30194 DFW Dallas/Fort Worth, TX 30397 ATL Atlanta, GA 09:04:00 14.0 11:59:00 ... 2.750751 2.390511 2.623853 1.784195 2.450777 2.613333 1.754902 -0.741742 True Dallas/Fort Worth, TX -> Atlanta, GA
4 2014-07-01 30194 DFW Dallas/Fort Worth, TX 30397 ATL Atlanta, GA 20:16:00 1.0 23:17:00 ... 2.750751 2.390511 2.623853 1.784195 2.450777 2.613333 1.754902 -0.741742 True Dallas/Fort Worth, TX -> Atlanta, GA

5 rows × 43 columns

In [17]:
target_airport_df_temp = target_airport_df.groupby('Route').size().reset_index().rename(columns = {0: 'DEP_DELAY'})
target_airport_df_temp.sort_values("DEP_DELAY", ascending = False, inplace = True)
target_airport_df_temp = target_airport_df_temp.iloc[:10, :]
In [18]:
import plotly.express as px
import numpy as np

np.random.seed(1)


top10_busy_route = pd.merge(
    target_airport_df, 
    target_airport_df_temp, 
    how = "inner", 
    on = 'Route', 
    suffixes = ["", "_mean"]
).sample(100000, replace = False)
fig = px.box(
    top10_busy_route.sort_values("DEP_DELAY_mean", ascending = False), 
    x="Route", 
    y="DEP_DELAY",
    log_y = True,
    color_discrete_map = {True: 'red', False: 'blue'}
)
fig.update_layout(
    height=1000, 
    title_text=f"Aiport Departure Delay",
    showlegend=True
)
fig.show()

Cancelled¶

In [16]:
train_columns = [
    'Origin Rating Value', 
    'Origin Queue Time',
    'Origin Terminal Cleanliness', 
    'Origin Terminal Seating', 
    'Origin Terminal Sign',
    'Origin Food Beverage', 
    'Origin Airport Shopping', 
    'Origin Wifi Connectivity', 
    'Origin Airport Staff',
    'Origin Sentiment',
    'Origin Busy',
    'Destination Rating Value', 
    'Destination Queue Time',
    'Destination Terminal Cleanliness', 
    'Destination Terminal Seating', 
    'Destination Terminal Sign',
    'Destination Food Beverage', 
    'Destination Airport Shopping', 
    'Destination Wifi Connectivity', 
    'Destination Airport Staff',
    'Destination Sentiment', 
    'Destination Busy'
]
label_columns = [
    "Carrier",
    "Weather",
    "NAS",
    "Security",
    "Not"
]
In [ ]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

temp = pd.concat([
    target_airport_df, 
    pd.get_dummies(target_airport_df['CANCELLATION_CODE'], prefix = "Cancellation")
], axis = 1)

temp.rename(columns = {
    "Cancellation_A": "Carrier",
    "Cancellation_B": "Weather",
    "Cancellation_C": "NAS",
    "Cancellation_D": "Security",
    "Cancellation_Not": "Not"
}, inplace = True)

corr_matrix = temp.loc[:, train_columns + label_columns].corr()

plt.figure(figsize=(10, 5))
sns.heatmap(corr_matrix.loc[train_columns, label_columns], annot=True, cmap='coolwarm')
plt.title('Airport Review Score Correlation with Cancellation Reason')
plt.show()
In [14]:
temp.head()
Out[14]:
FL_DATE ORIGIN_CITY_MARKET_ID ORIGIN ORIGIN_CITY_NAME DEST_CITY_MARKET_ID DEST DEST_CITY_NAME DEP_TIME DEP_DELAY ARR_TIME ... Destination Airport Shopping Destination Wifi Connectivity Destination Airport Staff Destination Sentiment Destination Busy Carrier Weather NAS Security Not
0 2014-07-01 31057 CLT Charlotte, NC 30194 DFW Dallas/Fort Worth, TX 10:25:00 -5.0 12:14:00 ... 2.793478 2.647619 2.102564 -0.531429 True False False False False True
1 2014-07-01 30194 DFW Dallas/Fort Worth, TX 31057 CLT Charlotte, NC 06:21:00 -4.0 09:45:00 ... 2.468468 2.100917 1.947020 -0.715976 True False False False False True
2 2014-07-01 30397 ATL Atlanta, GA 30194 DFW Dallas/Fort Worth, TX 12:38:00 -2.0 13:41:00 ... 2.793478 2.647619 2.102564 -0.531429 True False False False False True
3 2014-07-01 30194 DFW Dallas/Fort Worth, TX 30397 ATL Atlanta, GA 09:04:00 14.0 11:59:00 ... 2.450777 2.613333 1.754902 -0.741742 True False False False False True
4 2014-07-01 30194 DFW Dallas/Fort Worth, TX 30397 ATL Atlanta, GA 20:16:00 1.0 23:17:00 ... 2.450777 2.613333 1.754902 -0.741742 True False False False False True

5 rows × 47 columns

In [17]:
corr_matrix = temp.loc[:, train_columns + label_columns].corr()

plt.figure(figsize=(10, 5))
sns.heatmap(corr_matrix.loc[train_columns, label_columns], annot=True, cmap='coolwarm')
plt.title('Airport Review Score Correlation with Cancellation Reason')
plt.show()
No description has been provided for this image
In [70]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from tqdm import tqdm
import numpy as np
import math
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize

def preprocess(df, y_columns='ARR_DELAY'):
    X = df.loc[:, train_columns]
    y = df.loc[:, y_columns]
    return X, y
In [ ]:
train, test = train_test_split(target_airport_df, test_size=0.2)
In [72]:
train_X, train_y = preprocess(train, "CANCELLATION_CODE")
test_X, test_y = preprocess(test, "CANCELLATION_CODE")

min_max_scaler = preprocessing.MinMaxScaler()
train_X_scaled = min_max_scaler.fit_transform(train_X)
test_X_scaled = min_max_scaler.transform(test_X)
In [73]:
model = LogisticRegression(
    multi_class='ovr', 
    max_iter=1000,
    class_weight="balanced"
)
model.fit(train_X_scaled, train_y)
C:\Users\wongh\anaconda3\envs\601\lib\site-packages\sklearn\linear_model\_logistic.py:1256: FutureWarning:

'multi_class' was deprecated in version 1.5 and will be removed in 1.7. Use OneVsRestClassifier(LogisticRegression(..)) instead. Leave it to its default value to avoid this warning.

Out[73]:
LogisticRegression(class_weight='balanced', max_iter=1000, multi_class='ovr')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(class_weight='balanced', max_iter=1000, multi_class='ovr')
In [74]:
import seaborn as sn
import matplotlib.pyplot as plt

train_predictions = model.predict(train_X_scaled)
test_predictions = model.predict(test_X_scaled)

train_conf_matrix = confusion_matrix(train_y, train_predictions)
test_conf_matrix = confusion_matrix(test_y, test_predictions)

sn.heatmap(train_conf_matrix, annot=True, fmt='g')
plt.show()

sn.heatmap(test_conf_matrix, annot=True, fmt='g')
plt.show()
No description has been provided for this image
No description has been provided for this image
In [76]:
train_y_encoded = label_binarize(train_y, classes=model.classes_)
test_y_encoded = label_binarize(test_y, classes=model.classes_)

train_probs = model.predict_proba(train_X_scaled)
test_probs = model.predict_proba(test_X_scaled)

fpr_train, tpr_train, _ = roc_curve(train_y_encoded.ravel(), train_probs.ravel())
fpr_test, tpr_test, _ = roc_curve(test_y_encoded.ravel(), test_probs.ravel())

train_auc = roc_auc_score(train_y_encoded, train_probs, average='micro')
test_auc = roc_auc_score(test_y_encoded, test_probs, average='micro')
In [77]:
def roc(fpr, tpr, auc):
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.plot(
        fpr, 
        tpr, 
        label=f'Train ROC curve (area = {auc:.2})'
    )
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Train ROC Curve')
    plt.legend(loc="lower right")

roc(fpr_train, tpr_train, train_auc)
roc(fpr_test, tpr_test, test_auc)
No description has been provided for this image
No description has been provided for this image
In [78]:
weight = model.coef_

fig, axs = plt.subplots(5, 1, figsize=(10, 20))

for i in range(5):
    ax = axs[i]
    ax.barh(train_columns, weight[i])
    ax.set_title(f'Bar Chart {label_columns[i]}')

plt.tight_layout()
plt.show()
No description has been provided for this image